I did this a bit flippantly before, but I want to fomalize the process by which we estimate the uncertainty on emulator predictions.



In [63]:

    
from pearce.emulator import SpicyBuffalo, LemonPepperWet
from pearce.mocks import cat_dict
import numpy as np
from os import path



In [64]:

    
import matplotlib
#matplotlib.use('Agg')
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()



In [65]:

    
#xi gg
training_file = '/scratch/users/swmclau2/xi_zheng07_cosmo_lowmsat/PearceRedMagicXiCosmoFixedNd.hdf5'
#test_file= '/scratch/users/swmclau2/xi_zheng07_cosmo_test_lowmsat2/'
test_file =  '/scratch/users/swmclau2/xi_zheng07_cosmo_test_lowmsat2/PearceRedMagicXiCosmoFixedNd_Test.hdf5'

#xi gm training_file = '/scratch/users/swmclau2/xi_gm_cosmo/PearceRedMagicXiGMCosmoFixedNd.hdf5' test_file = '/scratch/users/swmclau2/xi_gm_cosmo_test2/PearceRedMagicXiGMCosmoFixedNdTest.hdf5'



In [66]:

    
em_method = 'gp'
split_method = 'random'



In [67]:

    
a = 1.0
z = 1.0/a - 1.0



In [68]:

    
fixed_params = {'z':z}#, 'cosmo': 0}#, 'r':24.06822623}

hp = np.loadtxt('/home/users/swmclau2/Git/pearce/bin/optimization/sloppy_joes_result_indiv_bins.npy')



In [69]:

    
from glob import glob



In [70]:

    
hp = []
for fname in sorted(glob('/home/users/swmclau2/Git/pearce/bin/optimization/sloppy_joes_indiv_bins/sloppy*.npy')):
    #print fname, len(hp)
    hp.append(np.loadtxt(fname))



In [71]:

    
len(hp)









    Out[71]:





18



In [72]:

    
param_names = ['ombh2', 'omch2', 'w0', 'ns', 'ln10As', 'H0', 'Neff', 'logM0', 'sigma_logM', 'logM1', 'alpha']



In [73]:

    
pnames = ['bias', 'amp']
pnames.extend(param_names)
pnames.append('amp')
pnames.extend(param_names)

from collections import defaultdict metric = defaultdict(list) for val, pname in zip(hp, pnames): metric[pname].append(val)



In [74]:

    
from collections import defaultdict
metric = []

for _hp in hp:
    metric.append(defaultdict(list))
    for val, pname in zip(_hp, pnames):
        metric[-1][pname].append(val)

np.random.seed(0) emu = SpicyBuffalo(training_file, method = em_method, fixed_params=fixed_params, custom_mean_function = 'linear', downsample_factor = 0.05, hyperparams = {'metric':metric})



In [75]:

    
np.random.seed(0)
emu = LemonPepperWet(training_file, method = em_method, fixed_params=fixed_params,
                 custom_mean_function = None, downsample_factor = 0.1)#, hyperparams = {'metric':metric})



In [76]:

    
emu.get_param_names()









    Out[76]:





['ombh2',
 'omch2',
 'w0',
 'ns',
 'ln10As',
 'H0',
 'Neff',
 'logM0',
 'sigma_logM',
 'logM1',
 'alpha']

test_x, test_y, test_ycov, info = emu.get_data(test_file, emu.fixed_params, attach_params = False)

gof = emu.goodness_of_fit(test_file, statistic = 'frac') #print gof.mean(), np.median(gof) for g in gof: print g.mean(), np.median(g)



In [77]:

    
pred_y, data_y = emu.goodness_of_fit(test_file, statistic = None)



In [78]:

    
print len(pred_y[0])



In [79]:

    
test_x, _, test_err, _ = emu.get_data(test_file, emu.fixed_params)

t, old_idxs  = emu._whiten(test_x)



In [80]:

    
train_x, train_y, train_err, info = emu.get_data(training_file, emu.fixed_params)









    



/home/users/swmclau2/.local/lib/python2.7/site-packages/pearce/emulator/emu.py:294: UserWarning: WARNING: NaN detected. Skipped 21 points in training data.
  warnings.warn('WARNING: NaN detected. Skipped %d points in training data.' % (num_skipped))



In [ ]:



In [81]:

    
len(train_err)









    Out[81]:





40000



In [82]:

    
mean_func_at_params = emu.mean_function(t)



In [83]:

    
mean_func_at_params.shape









    Out[83]:





(18,)

mean_func_at_params = self.mean_function(t) if self._downsample_factor == 1.0: y = self.y else: y = self.downsample_y mu = [] err = [] for bin_no, (t_in_bin, mfc,_y, emulator) in enumerate(izip(t, mean_func_at_params, y, self._emulators)): #print '_y', _y if self.method == 'gp': if gp_errs: local_mu, local_err = emulator.predict(_y, t_in_bin, return_cov = False,return_var=True) else: #print _y #print t_in_bin local_mu = emulator.predict(_y, t_in_bin, return_cov = False,return_var=False) #print local_mu local_err = np.ones_like(local_mu) else: local_mu = emulator.predict(t_in_bin) local_err = np.ones_like(local_mu) # weight with this instead of the errors. #print 'local_mu, mfc', local_mu, mfc mu.append(self._y_std[bin_no]*(local_mu + mfc) + self._y_mean[bin_no]) err.append(local_err*self._y_std[bin_no]) #now, figure out how to return to the shape of t combined_mu = np.zeros((t_size,)) combined_err = np.zeros((t_size,)) for r_idx, bin_idxs in enumerate(old_idxs): combined_mu[bin_idxs] = mu[r_idx] combined_err[bin_idxs] = err[r_idx] # Reshape to be consistent with my other implementation if not gp_errs: return combined_mu return combined_mu, combined_err



In [84]:

    
plt_idx = 4249
print t[0][plt_idx]
print pred_y[0][plt_idx]
print data_y[0][plt_idx]
#print mean_func_at_params[0][plt_idx]









    



[ 1.33619946 -1.41073566  1.2937706   1.09007699 -0.99425337 -1.29157473
 -1.01915118 -0.62179221 -0.20608562  0.74996107 -0.50398343]
3.86065096638
3.99002394681

emu._emulators[0].predict(np.array([t[0][plt_idx]]), emu.downsample_y[0])

for plt_idx in np.random.choice(len(pred_y[0]), 100): print plt_idx pred_plot = np.array([pred_y[bi][plt_idx] for bi in xrange(len(pred_y))]) data_plot = np.array([data_y[bi][plt_idx] for bi in xrange(len(pred_y))]) mean_plot = np.array([mean_func_at_params[bi][plt_idx] for bi in xrange(len(pred_y))]) #mean_plot = np.array([mean_func_at_params[bi] for bi in xrange(len(pred_y))]) plt.plot(emu.scale_bin_centers, pred_plot, label = "Emu") plt.plot(emu.scale_bin_centers, data_plot, label = "Data") plt.plot(emu.scale_bin_centers, mean_plot, label = "Mean") plt.xscale('log') plt.legend(loc = 'best') plt.show()

hod_no = 10 cosmo_no = 2 #for cosmo_no in xrange(7): for realization_no in xrange(5): for idx in xrange(len(pred_y)): py = pred_y[idx][cosmo_no*5+realization_no+hod_no::1000] dy = data_y[idx][cosmo_no*5+realization_no+hod_no::1000] diffmat = np.abs(10**py-10**dy) denommat = 10**dy acc = diffmat/denommat mean_acc = np.mean(acc) print mean_acc, #plt.vlines(mean_acc, 0, 000) #plt.hist(acc, bins = np.logspace(-3, 3, 30)) #plt.xscale('log') #plt.show() print

resmat = np.zeros( (len(pred_y), pred_y[-1].shape[0])) for bin_no in xrange(len(pred_y)): py, dy = pred_y[bin_no], data_y[bin_no] resmat[bin_no, :py.shape[0]] = 10**py - 10**dy



In [85]:

    
resmat = np.zeros(( 7, 5, 1000, 18))
resmat_log = np.zeros_like(resmat)
predmat = np.zeros(( 7, 5, 1000, 18))

datamat = np.zeros(( 7, 5, 1000, 18))

for bin_no in xrange(len(pred_y)):
    py, dy = pred_y[bin_no], data_y[bin_no]
    for cosmo_no in xrange(7):
        cpy, cdy = py[cosmo_no*5000:(cosmo_no+1)*5000], dy[cosmo_no*5000:(cosmo_no+1)*5000]
        #for realization_no in xrange(5):
        for hod_no in xrange(1000):
            crpy, crdy = cpy[hod_no::1000], cdy[hod_no::1000]

            #for hod_no in xrange(1000):
            #print crpy.std()
            predmat[cosmo_no, :,  hod_no , bin_no] = crpy
            datamat[cosmo_no, :,  hod_no , bin_no] = crdy
            resmat[cosmo_no, :,  hod_no , bin_no] = 10**crpy - 10**crdy
            resmat_log[cosmo_no, :,  hod_no , bin_no] = crpy - crdy



In [86]:

    
resmat.shape









    Out[86]:





(7, 5, 1000, 18)



In [87]:

    
N = 5
for cosmo_no in xrange(7):
    for hod_no in xrange(1000):
        r = resmat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C').T
        print r.shape
        scovmat = r.dot(r.T)/r.shape[1]
        #fig = plt.figure(figsize = (10, 6))
        #plt.subplot(121)
        im = plt.imshow(np.log10(np.abs(scovmat) ))
        plt.colorbar(im)
        #plt.subplot(122)
        #pred = 10**predmat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C').T
        #print pred.shape
        #data = datamat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C').T
        #scovmat2 = np.cov(pred)
        #im = plt.imshow(np.log10(np.abs(scovmat2) ), cmap = 'viridis', vmin = -5,vmax = 9)
        #plt.colorbar(im)
        
        plt.show()
        N-=1
        
        if N == 0:
            break
    if N==0:
        break



In [88]:

    
N = 50
for cosmo_no in xrange(7):
    for hod_no in xrange(1000):
        
        pred = 10**predmat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C')
        data = 10**datamat[cosmo_no, :, hod_no].reshape((-1, 18), order = 'C')
        for p,d in zip(pred,data):
            plt.plot(emu.scale_bin_centers, np.abs(p-d)/d)
        #for d in data:
        #    plt.plot(emu.scale_bin_centers, d)
        
        plt.xscale('log')
        plt.yscale('log')
        plt.show()
        N-=1
        
        if N == 0:
            break
    if N==0:
        break



In [89]:

    
resmat_flat = resmat.reshape((-1, 18)).T
datamat_flat = datamat.reshape((-1, 18)).T
#resmat_hodrealav = np.mean(resmat_realav, axis = 1)



In [90]:

    
r_idx = 10
t_bin = t[r_idx]
acc_bin = np.abs(resmat_flat[r_idx])/datamat_flat[r_idx]



In [91]:

    
t_bin.shape, acc_bin.shape









    Out[91]:





((35000, 11), (35000,))



In [92]:

    
percentiles = np.percentile(acc_bin, range(101))
norm_acc_bin = np.digitize(acc_bin, percentiles)
#norm_acc_bin = 100*((acc_bin - acc_bin.min())/acc_bin.max()).astype(int)



In [93]:

    
palette = sns.diverging_palette(220, 20, n=len(percentiles)-1, as_cmap=True)
#sns.set_palette(palette)



In [94]:

    
pnames = emu.get_param_names()



In [95]:

    
for axes1 in xrange(7,11):
    for axes2 in xrange(axes1+1, 11):
        cbar = plt.scatter(t_bin[:,axes1 ], t_bin[:,axes2], c = norm_acc_bin,cmap = palette, alpha = 0.2)
        plt.colorbar(cbar)
        plt.xlabel(pnames[axes1])
        plt.ylabel(pnames[axes2])
        #plt.gray()
        plt.show()



In [96]:

    
test_err_bin = test_err[:, r_idx, r_idx]



In [97]:

    
plt.hist(np.log10(test_err_bin) )









    Out[97]:





(array([   21.,   136.,   635.,  2505.,  6071.,  9794.,  9807.,  4954.,
          997.,    80.]),
 array([-6.4880701 , -6.23495877, -5.98184744, -5.72873611, -5.47562478,
        -5.22251345, -4.96940212, -4.71629079, -4.46317947, -4.21006814,
        -3.95695681]),
 <a list of 10 Patch objects>)

Why are these different?



In [98]:

    
plt.hist(np.log10(np.sqrt(emu.yerr[r_idx])) )









    Out[98]:





(array([   164.,   2085.,   8315.,  10257.,   8919.,   5685.,   2963.,
          1287.,    289.,     36.]),
 array([-1.41924799, -1.30072147, -1.18219495, -1.06366842, -0.9451419 ,
        -0.82661538, -0.70808885, -0.58956233, -0.47103581, -0.35250928,
        -0.23398276]),
 <a list of 10 Patch objects>)



In [99]:

    
len(emu.yerr)









    Out[99]:





18



In [100]:

    
percentiles = np.percentile(np.log10(test_err_bin), [0, 10,20,30,40,50,60,70,80,90,100])
norm_err_bin = np.digitize(np.log10(test_err_bin), percentiles)



In [101]:

    
#relevant stat is uncertainty in training error, not test error
for axes1 in xrange(7,11):
    for axes2 in xrange(axes1+1, 11):
        cbar = plt.scatter(t_bin[:,axes1 ], t_bin[:,axes2], c = norm_err_bin,cmap = palette, alpha = 0.2)
        plt.colorbar(cbar)
        plt.xlabel(pnames[axes1])
        plt.ylabel(pnames[axes2])
        #plt.gray()
        plt.show()



In [102]:

    
test_err_diag= np.array([test_err[:, r_idx, r_idx] for r_idx in xrange(emu.n_bins)] )



In [103]:

    
test_err_diag.mean(axis = 1)









    Out[103]:





array([  6.66416262e-04,   3.90266113e-04,   2.40298112e-04,
         1.59608324e-04,   1.11817891e-04,   8.52685284e-05,
         6.85463332e-05,   3.27251532e-05,   1.61159551e-05,
         1.23840494e-05,   1.22300077e-05,   1.20386761e-05,
         8.81174492e-06,   6.19478076e-06,   5.05835371e-06,
         4.66060676e-06,   5.20829565e-06,   7.42500106e-06])



In [112]:

    
np.sqrt(np.mean(np.square(np.abs(resmat_flat)/(10**datamat_flat) )))









    Out[112]:





0.084350297620042369



In [104]:

    
plt.plot(emu.scale_bin_centers, np.mean(np.abs(resmat_flat)/(10**datamat_flat), axis = 1) )
plt.plot(emu.scale_bin_centers, np.mean(np.abs(test_err_diag), axis = 1) )

plt.xscale('log')



In [105]:

    
resmat_log_flat = resmat_log.reshape((-1, 18)).T

emu.yerr[0].mean(), emu.y[0]



In [106]:

    
np.mean(np.array([ye/np.abs(y+1e-9) for ye, y in zip(emu.yerr, emu.y)]) , axis = 1)









    Out[106]:





array([  6.88971478e+02,   6.13597662e-03,   6.64106546e-03,
         7.28736319e-03,   8.16241541e-03,   9.42955257e-03,
         7.25344587e+01,   1.10902183e-02,   1.14167158e-02,
         1.31384973e-02,   1.63399965e-02,   2.17118471e-02,
         3.10878552e-02,   5.23228689e-02,   2.06445287e-01,
         8.96476256e-01,   1.11314614e-01,   3.06048094e-02])



In [107]:

    
plt.plot(emu.scale_bin_centers, np.mean(np.abs(resmat_log_flat)/np.abs(datamat_flat), axis = 1), label = 'Pred acc' )
#plt.plot(emu.scale_bin_centers, np.mean(np.array([ye/np.abs(y+1e-9) for ye, y in zip(emu.yerr, emu.y)]), axis = 1), label = 'Training Err' )
plt.yscale('log')
plt.legend(loc ='best')
plt.xscale('log')



In [108]:

    
for res, dat in zip(resmat_flat, 10**datamat_flat):
    plt.hist(res/dat, bins = 30)#, bins = np.linspace(-1, 1, 30))
    plt.yscale('log')
    plt.show()



In [109]:

    
scovmat = resmat_flat.dot(resmat_flat.T)/(len(pred_y[0])-1)



In [110]:

    
im = plt.imshow(np.log10(scovmat) )
plt.colorbar(im)
plt.show()



In [111]:

    
print np.sqrt(np.diag(scovmat))









    



[  6.82852886e+03   4.22204695e+03   2.60080324e+03   1.57332998e+03
   9.29063406e+02   5.35138494e+02   2.99363194e+02   1.61047184e+02
   8.45135299e+01   3.97037362e+01   1.44902002e+01   3.71410566e+00
   8.29729209e-01   2.73203089e-01   1.34804020e-01   6.63547077e-02
   3.28807034e-02   2.44989037e-02]

# xi_gg [ 5.24570257e+03 3.23374673e+03 1.87386389e+03 1.21527448e+03 6.75245722e+02 3.90243992e+02 2.29159542e+02 1.35977239e+02 7.92194628e+01 3.78496692e+01 1.44829661e+01 4.31545126e+00 8.90656761e-01 2.21231511e-01 1.58627067e-01 5.99630335e-02 3.33024301e-02 2.38020596e-02]

# xi_gm [ 6.49230284e+02 2.93763883e+02 1.38709271e+02 6.46778563e+01 3.78016132e+01 2.51962597e+01 1.64736353e+01 1.11197889e+01 6.22022662e+00 2.74484762e+00 1.04167114e+00 3.10671790e-01 9.63501742e-02 5.49056973e-02 3.15483462e-02 1.63669320e-02 8.80627565e-03 5.35716884e-03]



In [ ]:



In [ ]:



In [ ]: